
# coding: utf-8

# In[1]:


import pandas as pd


# In[2]:


media_cloud600 = pd.read_excel("Media Cloud 600 Topics.xlsx")

media_cloud_tagger = pd.read_excel("US News Sample - Media Cloud Tagger_above 100 words.xlsx")

ner = pd.read_excel("sentence_to_wiki.xlsx")


# In[3]:


categorys_lst = []
with open("categorys.txt") as f:
    categorys_lst = [category.strip('\n').split(',') for category in f.readlines()]


# In[4]:


media_cloud600_droped = media_cloud600[media_cloud600["Topic category"]!=99]


# In[5]:


ner = ner[ner["LABEL"]!="GPE"]


# In[6]:


replace_dict = dict()
for categorys in categorys_lst:
    if len(categorys)==2:
        replace_dict[categorys[1]] = categorys[0]


# In[7]:


def fix_topic_category(category):
    if category in replace_dict.keys():
        
        return replace_dict[category]
    else:
        return category


# In[8]:


media_cloud600_droped["Topic category"] = media_cloud600_droped["Topic category"].apply(fix_topic_category)
media_cloud600["Topic category"].replace(99, '', inplace=True)


# In[9]:


tag2category_dict = {}

for _, row in media_cloud600_droped.iterrows():
    tag = row["tag"]
    category = row["Topic category"]
    tag2category_dict[tag] = category


# In[10]:


replace_columns = ["Topic %s"%i for i in range(1,21)]


# In[11]:


def replace_topic(topic):
    return tag2category_dict.get(topic,'')


# In[12]:


for replace_column in replace_columns:
    ner[replace_column] = ner[replace_column].apply(replace_topic)
    media_cloud_tagger[replace_column] = media_cloud_tagger[replace_column].apply(replace_topic)


# In[13]:


ner["Lable And Topic 1"] = ner["LABEL"] + '_' + ner["Topic 1"]


# In[14]:


bi_ner = ner[["Article","Lable And Topic 1"]]


# In[15]:


count_ner = bi_ner.groupby(["Article","Lable And Topic 1"]).size().reset_index()


# In[16]:


stack_count_ner = count_ner.set_index(["Article","Lable And Topic 1"]).unstack('Lable And Topic 1')


# In[17]:


stack_count_ner.columns = stack_count_ner.columns.get_level_values(1)


# In[18]:


stack_count_ner = stack_count_ner.reset_index()


# In[19]:


result = pd.merge(media_cloud_tagger,stack_count_ner,on="Article",how='left')


# In[20]:


result = result.fillna(0)


# In[21]:


cloud_dump = result.copy()


# In[22]:


cloud_article_topics = media_cloud_tagger[["Article"]+replace_columns]


# In[23]:


category_lst = list(set(tag2category_dict.values()))


# In[24]:


def topic_category(cate):
    return "TOPIC_"+cate


# In[25]:


D = {}
for category in category_lst:
    D[category] = []
D["Unknown"] = []
D["Article"] = []


# In[26]:


def article_topic_invert(row_data):
    article = row_data["Article"]
    topics = list(row_data[1:4])
    unique_topics = list(set(topics))
    for category in category_lst+["Unknown"]:
        if category in unique_topics:
            D[category] += [topics.index(category)+1]
        else:
            D[category] += [0]
    D["Article"] += [article]      


# In[27]:


cloud_article_topics.apply(article_topic_invert,axis=1)


# In[28]:


df2 = pd.DataFrame(D)


# In[29]:


column_name = {column:"TOPIC_"+column for column in df2.columns[:-2]}


# In[30]:


df2.rename(column_name,axis=1,inplace=True)


# In[31]:


merge_df = pd.merge(df2,result,on="Article")


# In[32]:


columns  = sorted(list(merge_df.columns))


# In[33]:


for column in list(media_cloud_tagger.columns):
    columns.remove(column)


# In[34]:


columns = list(media_cloud_tagger.columns) + columns


# In[35]:


merge_df = merge_df[columns]


# In[36]:


#new added


# In[37]:


from numpy import nan


# In[38]:


merge_df.replace(0,nan,inplace=True)
merge_df.dropna(axis=1,how='all',inplace=True)
merge_df.dropna(subset=list(merge_df.columns[52:]),how='all',inplace=True)
df_list=list(merge_df.columns)


df = merge_df[df_list]
df.replace(nan,0,inplace=True)
df.to_excel('result.xlsx')

